Analysis of World Happiness

Project goal

My goal in this project is to understand which features of a country’s government are correlated with greater happiness. I will use visual and predictive techniques to conduct my analysis.

library(tidyverse)
library(patchwork)

source("functions/prepareData.R")
happiness_orig <- read_csv("data/happiness_data.csv")
happiness_orig
# A tibble: 2,132 × 18
   country      year happiness happiness_sd log_gdp_per_capita social_support
   <chr>       <dbl>     <dbl>        <dbl>              <dbl>          <dbl>
 1 Afghanistan  2005     NA           NA                 NA            NA    
 2 Afghanistan  2006     NA           NA                 NA            NA    
 3 Afghanistan  2007     NA           NA                 NA            NA    
 4 Afghanistan  2008      3.72         1.77               7.17          0.451
 5 Afghanistan  2009      4.40         1.72               7.33          0.552
 6 Afghanistan  2010      4.76         1.88               7.39          0.539
 7 Afghanistan  2011      3.83         1.79               7.42          0.521
 8 Afghanistan  2012      3.78         1.80               7.52          0.521
 9 Afghanistan  2013      3.57         1.22               7.50          0.484
10 Afghanistan  2014      3.13         1.40               7.48          0.526
# ℹ 2,122 more rows
# ℹ 12 more variables: life_expectancy <dbl>, freedom_choices <dbl>,
#   generosity <dbl>, corruption <dbl>, positive_affect <dbl>,
#   negative_affect <dbl>, government_confidence <dbl>,
#   democratic_quality <dbl>, delivery_quality <dbl>, gini_gallup <dbl>,
#   gini_world_bank <dbl>, gini_world_bank_average <dbl>

Making a plan for predictability

My hope is that my findings will be relevant over time. Since I plan to just use the 2017 data to conduct my analysis, and I will use the 2016 data to evaluate the predictability. (No split is needed).

Cleaning and preprocessing the data for analysis

The following code creates a version of the cleaned/pre-processed data in which we:

  1. Drop any countries that are missing the 2017 happiness score

  2. Use the Gallup gini index value or world bank

  3. Use forward-fill imputation or interpolation

happiness_clean <- prepareData(happiness_orig, 
                               drop_countries_missing_2017_happiness = FALSE, 
                               gini = "gallup", 
                               imputation = "ffill")

Analysis

# filter to 2017
happiness_2017 <- happiness_clean |> 
  select(country, year, corruption, gini, happiness) |>
  drop_na() |>
  filter(year == 2017)

# scatterplot of corruption vs happiness
gg_corruption <- ggplot(happiness_2017, aes(x = corruption, y = happiness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle(paste0("cor=", 
                 round(cor(happiness_2017$corruption, happiness_2017$happiness), 2)))

# scatterplot of gini index vs happiness
gg_gini <- ggplot(happiness_2017, aes(x = gini, y = happiness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle(paste0("cor=", 
                 round(cor(happiness_2017$gini, happiness_2017$happiness), 2)))

# make a grid of plots using + from patchwork library
gg_corruption + gg_gini  
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

The following code creates a function for this analysis with the “year” as a an argument to make predictability analysis easy.

happinessAnalysis <- function(happiness_clean, 
                              .year = 2017) {
  
  
  # filter to the selected year
  happiness_year <- happiness_clean |> 
    select(country, year, corruption, gini, happiness) |>
    drop_na() |>
    filter(year == .year)
  
  # scatterplot of corruption vs happiness
  gg_corruption <- ggplot(happiness_year, aes(x = corruption, y = happiness)) +
    geom_point() +
    geom_smooth(method = "lm") +
    ggtitle(paste0(.year, " | cor=", 
                   round(cor(happiness_year$corruption, happiness_year$happiness), 2)))
  
  # scatterplot of gini index vs happiness
  gg_gini <- ggplot(happiness_year, aes(x = gini, y = happiness)) +
    geom_point() +
    geom_smooth(method = "lm") +
    ggtitle(paste0(.year, " | cor=", 
                   round(cor(happiness_year$gini, happiness_year$happiness), 2)))
  
  # make a grid of plots using + from patchwork library
  gg_corruption + gg_gini  
}
happinessAnalysis(happiness_clean, .year = 2015)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Predictability analysis

happinessAnalysis(happiness_clean, .year = 2016)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

happinessAnalysis(happiness_clean, .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Stability analysis

Judgment calls

Let’s consider the stability of our analysis to our judgment calls

# dropping countries missing 2017 happiness scores
happiness_clean_drop <- prepareData(happiness_orig, 
                                    drop_countries_missing_2017_happiness = TRUE,
                                    gini = "gallup", 
                                    imputation = "ffill")
happinessAnalysis(happiness_clean_drop, .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# using interpolation to impute rather than forward fill
happiness_clean_interpolation <- prepareData(happiness_orig, 
                                             drop_countries_missing_2017_happiness = FALSE,
                                             gini = "gallup", 
                                             imputation = "interpolation")
happinessAnalysis(happiness_clean_interpolation, .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# using world bank gini measure
happiness_clean_wb <- prepareData(happiness_orig, 
                                  drop_countries_missing_2017_happiness = FALSE,
                                  gini = "world bank", 
                                  imputation = "ffill")
happinessAnalysis(happiness_clean_wb, .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# using world bank gini measure with interpolation
happiness_clean_wb_interpolation <- prepareData(happiness_orig, 
                                                drop_countries_missing_2017_happiness = FALSE,
                                                gini = "world bank", 
                                                imputation = "interpolation")
happinessAnalysis(happiness_clean_wb_interpolation, .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Data perturbations

Since all countries are already included in our dataset, sampling-based perturbations do not really make sense because there is no sampling uncertainty at the country-level (there is at the respondent level within each country, but we don’t have access to that data).

However, the actual average variable value reported for each country has some uncertainty. While we have the standard deviation for each country’s happiness score, we do not have this value for the other values.

# create 10 perturbed versions of the data
perturbData <- function(happiness_data) {
  happiness_data_perturbed <- happiness_data |>
    # for each row:
    drop_na(happiness) |>
    rowwise() |>
    # add a random normally-distributed value to each happiness score within 1 SD
    mutate(happiness = happiness + rnorm(1, mean = 0, sd = happiness_sd)) |>
    ungroup()
  return(happiness_data_perturbed)
}
# original results:
happinessAnalysis(happiness_clean, .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# perturbed versions of the results
happinessAnalysis(perturbData(happiness_clean), .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

happinessAnalysis(perturbData(happiness_clean), .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

happinessAnalysis(perturbData(happiness_clean), .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

happinessAnalysis(perturbData(happiness_clean), .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

happinessAnalysis(perturbData(happiness_clean), .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

happinessAnalysis(perturbData(happiness_clean), .year = 2017)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'